{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"3\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import youtokentome as yttm\n",
    "import json\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.naive_bayes import ComplementNB\n",
    "import numpy as np\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('bow-language-detection.pkl', 'rb') as fopen:\n",
    "    bow = pickle.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['train_X', 'test_X', 'train_Y', 'test_Y'])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('train-test.json') as fopen:\n",
    "    train_test = json.load(fopen)\n",
    "    \n",
    "train_test.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_Y = LabelEncoder().fit_transform(train_test['test_Y'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "bpe = yttm.BPE(model='language-detection.model')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4729650"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_subs = [' '.join(s) for s in bpe.encode(train_test['test_X'], output_type=yttm.OutputType.SUBWORD)]\n",
    "len(test_subs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_X = bow.transform(test_subs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_sparse_matrix_to_sparse_tensor(X):\n",
    "    coo = X.tocoo()\n",
    "    indices = np.mat([coo.row, coo.col]).transpose()\n",
    "    return tf.SparseTensorValue(indices, coo.col, coo.shape), tf.SparseTensorValue(indices, coo.data, coo.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "class Model:\n",
    "    def __init__(self, learning_rate, dimension = 32, output = 6):\n",
    "        self.X = tf.sparse_placeholder(tf.int32)\n",
    "        self.W = tf.sparse_placeholder(tf.int32)\n",
    "        self.Y = tf.placeholder(tf.int32, [None])\n",
    "        embeddings = tf.Variable(tf.truncated_normal([400000,dimension]))\n",
    "        embed = tf.nn.embedding_lookup_sparse(embeddings, self.X, self.W, combiner='mean')\n",
    "        self.embed = embed\n",
    "        self.logits = tf.layers.dense(embed, output)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/ops/embedding_ops.py:515: div (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Deprecated in favor of operator or tf.math.divide.\n",
      "WARNING:tensorflow:From <ipython-input-11-1cf05e8a87f2>:9: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use keras.layers.Dense instead.\n",
      "WARNING:tensorflow:From /home/husein/.local/lib/python3.6/site-packages/tensorflow_core/python/layers/core.py:187: Layer.apply (from tensorflow.python.keras.engine.base_layer) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use `layer.__call__` method instead.\n"
     ]
    }
   ],
   "source": [
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = Model(1e-3)\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "INFO:tensorflow:Restoring parameters from lang-detection-w/model.ckpt\n"
     ]
    }
   ],
   "source": [
    "saver = tf.train.Saver(tf.trainable_variables())\n",
    "saver.restore(sess, 'lang-detection-w/model.ckpt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 36951/36951 [01:00<00:00, 612.88it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "batch_size = 128\n",
    "predicted_Y = []\n",
    "\n",
    "for i in tqdm(range(0, test_X.shape[0], batch_size)):\n",
    "    index = min(i + batch_size, test_X.shape[0])\n",
    "    batch_x = convert_sparse_matrix_to_sparse_tensor(test_X[i: index])\n",
    "    results = sess.run(model.logits, feed_dict = {\n",
    "                model.X: batch_x[0],\n",
    "                model.W: batch_x[1],\n",
    "            })\n",
    "    predicted_Y.extend(np.argmax(results, axis = 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(test_Y) == len(predicted_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "         eng    0.96760   0.97401   0.97080    553739\n",
      "         ind    0.97635   0.96131   0.96877    576059\n",
      "       malay    0.96985   0.98498   0.97736   1800649\n",
      "    manglish    0.98036   0.96569   0.97297    181442\n",
      "       other    0.99641   0.99627   0.99634   1428083\n",
      "       rojak    0.94221   0.84302   0.88986    189678\n",
      "\n",
      "    accuracy                        0.97779   4729650\n",
      "   macro avg    0.97213   0.95421   0.96268   4729650\n",
      "weighted avg    0.97769   0.97779   0.97760   4729650\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(\n",
    "    metrics.classification_report(\n",
    "        test_Y,\n",
    "        predicted_Y,\n",
    "        target_names = ['eng', 'ind', 'malay', 'manglish', 'other', 'rojak'],\n",
    "        digits = 5\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
